Loading packages

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
library(ClassifyR)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: MultiAssayExperiment
## Loading required package: BiocParallel
library(SmokyScotch)
## Warning: replacing previous import 'magrittr::set_names' by
## 'purrr::set_names' when loading 'SmokyScotch'
## Warning: replacing previous import 'ggplot2::margin' by
## 'randomForest::margin' when loading 'SmokyScotch'
## Warning: replacing previous import 'dplyr::combine' by
## 'randomForest::combine' when loading 'SmokyScotch'

Loading data

# Import the Test-match batting table; janitor rewrites the column headers
# into smallCamel case (e.g. `Career Span` becomes careerSpan).
rawBattingData = janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - Batting.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
##   Player = col_character(),
##   `Career Span` = col_character(),
##   `Career Start` = col_integer(),
##   `Career End` = col_integer(),
##   `Matches Played` = col_integer(),
##   `Innings Batted` = col_character(),
##   `Not Outs` = col_character(),
##   `Runs Scored` = col_character(),
##   `Highest Innings Score` = col_character(),
##   `Highest Innings Score Num` = col_character(),
##   `Batting Avg` = col_character(),
##   `Hundreds Scored` = col_character(),
##   `Scores Of Fifty Or More` = col_character(),
##   `Ducks Scored` = col_character(),
##   Country = col_character(),
##   `Player Count` = col_integer(),
##   `10000+ Runs Scored` = col_integer(),
##   `50+ Batting Avg` = col_integer()
## )
# Quick structural overview of the raw batting table.
rawBattingData %>% glimpse()
## Observations: 2,918
## Variables: 18
## $ player                 <chr> "AN Cook (2006-2018)", "GA Gooch (1975-...
## $ careerSpan             <chr> "2006-2018", "1975-1995", "1990-2003", ...
## $ careerStart            <int> 2006, 1975, 1990, 1978, 2005, 1964, 198...
## $ careerEnd              <int> 2018, 1995, 2003, 1992, 2014, 1982, 200...
## $ matchesPlayed          <int> 156, 118, 133, 117, 104, 108, 115, 118,...
## $ inningsBatted          <chr> "282", "215", "235", "204", "181", "193...
## $ notOuts                <chr> "16", "6", "21", "18", "8", "23", "7", ...
## $ runsScored             <chr> "12145", "8900", "8463", "8231", "8181"...
## $ highestInningsScore    <chr> "294", "333", "190", "215", "227", "246...
## $ highestInningsScoreNum <chr> "294", "333", "190", "215", "227", "246...
## $ battingAvg             <chr> "45.65", "42.58", "39.54", "44.25", "47...
## $ hundredsScored         <chr> "32", "20", "15", "18", "23", "22", "16...
## $ scoresOfFiftyOrMore    <chr> "56", "46", "45", "39", "35", "42", "46...
## $ ducksScored            <chr> "8", "13", "14", "7", "10", "10", "20",...
## $ country                <chr> "England", "England", "England", "Engla...
## $ playerCount            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ x10000RunsScored       <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x50BattingAvg          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...
# Import the Test-match bowling table with smallCamel column names.
rawBowlingData = janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - Bowling.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
##   Player = col_character(),
##   `Innings Bowled In` = col_character(),
##   `Balls Bowled` = col_character(),
##   `Runs Conceded` = col_character(),
##   `Wickets Taken` = col_character(),
##   `Best Bowling In An Innings` = col_character(),
##   `Best Bowling In A Match` = col_character(),
##   `Bowling Avg` = col_character(),
##   `Economy Rate` = col_character(),
##   `Bowling Strike Rate` = col_character(),
##   `Five Wickets In An Innings` = col_character(),
##   `Ten Wickets In A Match` = col_character(),
##   `300+ Wickets Taken` = col_integer(),
##   `<25.00 Bowling Avg` = col_integer()
## )
# Quick structural overview of the raw bowling table.
rawBowlingData %>% glimpse()
## Observations: 2,918
## Variables: 14
## $ player                 <chr> "JM Anderson (2003-2018)", "SCJ Broad (...
## $ inningsBowledIn        <chr> "257", "215", "168", "165", "127", "151...
## $ ballsBowled            <chr> "30398", "24346", "21815", "17357", "15...
## $ runsConceded           <chr> "14705", "12050", "10878", "8190", "662...
## $ wicketsTaken           <chr> "540", "417", "383", "325", "307", "297...
## $ bestBowlingInAnInnings <chr> "7/42", "8/15", "8/34", "8/43", "8/31",...
## $ bestBowlingInAMatch    <chr> "11/71", "11/121", "13/106", "9/92", "1...
## $ bowlingAvg             <chr> "27.23", "28.89", "28.40", "25.20", "21...
## $ economyRate            <chr> "2.90", "2.96", "2.99", "2.83", "2.61",...
## $ bowlingStrikeRate      <chr> "56.2", "58.3", "56.9", "53.4", "49.4",...
## $ fiveWicketsInAnInnings <chr> "25", "16", "27", "16", "17", "17", "17...
## $ tenWicketsInAMatch     <chr> "3", "2", "4", "0", "3", "6", "3", "1",...
## $ x300WicketsTaken       <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x25_00BowlingAvg       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
# Import the all-rounder table with smallCamel column names.
rawAllRounderData = janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - All Round.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
##   Player = col_character(),
##   `Batting-Bowling Avg Diff` = col_double(),
##   `All-Round Ind` = col_integer()
## )
# Quick structural overview of the raw all-rounder table.
rawAllRounderData %>% glimpse()
## Observations: 25
## Variables: 3
## $ player                <chr> "AW Greig (1972-1977)", "IT Botham (1977...
## $ battingBowlingAvgDiff <dbl> 8.23, 5.14, 3.22, 0.52, -1.45, -8.28, 13...
## $ allRoundInd           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Players must have strictly more innings than this to be kept; the same
# cut-off is applied to the batting and the bowling tables below.
inningsThres <- 40

Cleaning data

Batting data

# Tidy the batting table:
#   1. drop the columns that are not analysed,
#   2. remove the parenthesised career span appended to each player name,
#   3. parse the text-encoded counts and averages into numbers,
#   4. keep only players with more than `inningsThres` innings batted.
cleanedBattingData = rawBattingData %>%
  dplyr::select(-careerSpan, -highestInningsScore, -playerCount) %>%
  dplyr::mutate(
    player = str_replace(player, " \\([^>]+\\)", ""),
    # Entries that are not valid numbers become NA (hence the coercion
    # warnings); rows with NA innings are then removed by the filter below.
    inningsBatted = as.integer(inningsBatted),
    notOuts = as.integer(notOuts),
    runsScored = as.numeric(runsScored),
    battingAvg = as.numeric(battingAvg),
    hundredsScored = as.integer(hundredsScored),
    scoresOfFiftyOrMore = as.integer(scoresOfFiftyOrMore),
    ducksScored = as.integer(ducksScored)
  ) %>%
  dplyr::filter(inningsBatted > inningsThres)
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
# Inspect the cleaned batting table.
cleanedBattingData %>% glimpse()
## Observations: 620
## Variables: 15
## $ player                 <chr> "AN Cook", "GA Gooch", "AJ Stewart", "D...
## $ careerStart            <int> 2006, 1975, 1990, 1978, 2005, 1964, 198...
## $ careerEnd              <int> 2018, 1995, 2003, 1992, 2014, 1982, 200...
## $ matchesPlayed          <int> 156, 118, 133, 117, 104, 108, 115, 118,...
## $ inningsBatted          <int> 282, 215, 235, 204, 181, 193, 212, 205,...
## $ notOuts                <int> 16, 6, 21, 18, 8, 23, 7, 24, 15, 16, 6,...
## $ runsScored             <dbl> 12145, 8900, 8463, 8231, 8181, 8114, 77...
## $ highestInningsScoreNum <chr> "294", "333", "190", "215", "227", "246...
## $ battingAvg             <dbl> 45.65, 42.58, 39.54, 44.25, 47.28, 47.7...
## $ hundredsScored         <int> 32, 20, 15, 18, 23, 22, 16, 22, 22, 22,...
## $ scoresOfFiftyOrMore    <int> 56, 46, 45, 39, 35, 42, 46, 46, 38, 24,...
## $ ducksScored            <int> 8, 13, 14, 7, 10, 10, 20, 14, 9, 4, 15,...
## $ country                <chr> "England", "England", "England", "Engla...
## $ x10000RunsScored       <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x50BattingAvg          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...

Bowling data

# Tidy the bowling table: clean the player names, parse the text-encoded
# statistics, label each player as "bowler" / "Not bowler", split the
# "wickets/runs" best-figures columns, drop incomplete rows and keep players
# with more than `inningsThres` innings bowled.
cleanedBowlingData = rawBowlingData %>%
  dplyr::mutate(
    player = str_replace(player, " \\([^>]+\\)", ""),
    inningsBowledIn = as.integer(inningsBowledIn),
    ballsBowled = as.integer(ballsBowled),
    runsConceded = as.integer(runsConceded),
    wicketsTaken = as.integer(wicketsTaken),
    bowlingAvg = as.numeric(bowlingAvg),
    economyRate = as.numeric(economyRate),
    bowlingStrikeRate = as.numeric(bowlingStrikeRate),
    fiveWicketsInAnInnings = as.integer(fiveWicketsInAnInnings),
    tenWicketsInAMatch = as.integer(tenWicketsInAMatch),
    # 50 career wickets is the cut-off for being labelled a bowler;
    # a missing wicket count yields NA and the row is dropped by na.omit().
    isBowler = ifelse(wicketsTaken >= 50, "bowler", "Not bowler")
  ) %>%
  # "7/42" -> wickets "7" and runs "42"; both pieces stay character.
  tidyr::separate(
    bestBowlingInAnInnings,
    into = c("mostWicketsInnings", "mostWicketsInningsRuns"),
    sep = "/"
  ) %>%
  tidyr::separate(
    bestBowlingInAMatch,
    into = c("mostWicketsMatch", "mostWicketsMatchRuns"),
    sep = "/"
  ) %>%
  na.omit() %>%
  dplyr::filter(inningsBowledIn > inningsThres)
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1178 rows
## [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
## 413, 414, 415, 416, 417, ...].

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1178 rows
## [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
## 413, 414, 415, 416, 417, ...].
# Inspect the cleaned bowling table.
cleanedBowlingData %>% glimpse()
## Observations: 325
## Variables: 17
## $ player                 <chr> "JM Anderson", "SCJ Broad", "IT Botham"...
## $ inningsBowledIn        <int> 257, 215, 168, 165, 127, 151, 109, 129,...
## $ ballsBowled            <int> 30398, 24346, 21815, 17357, 15178, 2186...
## $ runsConceded           <int> 14705, 12050, 10878, 8190, 6625, 7674, ...
## $ wicketsTaken           <int> 540, 417, 383, 325, 307, 297, 255, 252,...
## $ mostWicketsInnings     <chr> "7", "8", "8", "8", "8", "8", "6", "7",...
## $ mostWicketsInningsRuns <chr> "42", "15", "34", "43", "31", "51", "65...
## $ mostWicketsMatch       <chr> "11", "11", "13", "9", "12", "13", "10"...
## $ mostWicketsMatchRuns   <chr> "71", "121", "106", "92", "119", "71", ...
## $ bowlingAvg             <dbl> 27.23, 28.89, 28.40, 25.20, 21.57, 25.8...
## $ economyRate            <dbl> 2.90, 2.96, 2.99, 2.83, 2.61, 2.10, 2.9...
## $ bowlingStrikeRate      <dbl> 56.2, 58.3, 56.9, 53.4, 49.4, 73.6, 60....
## $ fiveWicketsInAnInnings <int> 25, 16, 27, 16, 17, 17, 17, 9, 7, 15, 1...
## $ tenWicketsInAMatch     <int> 3, 2, 4, 0, 3, 6, 3, 1, 1, 5, 1, 0, 1, ...
## $ x300WicketsTaken       <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x25_00BowlingAvg       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ isBowler               <chr> "bowler", "bowler", "bowler", "bowler",...

All rounder data

# Remove the parenthesised career span from the all-rounder names so they
# can be matched against the cleaned batting/bowling player names.
cleanedAllRounderData = dplyr::mutate(
  rawAllRounderData,
  player = str_replace(player, " \\([^>]+\\)", "")
)

Monocle

library(monocle)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following objects are masked from 'package:ClassifyR':
## 
##     featureNames, sampleNames
## Loading required package: VGAM
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:tidyr':
## 
##     fill
## Loading required package: DDRTree
## Loading required package: irlba
monocle_batting = cleanedBattingData

# Feature-by-player matrix: numeric batting columns become rows, players
# become columns.
monocle_matrix = monocle_batting %>%
  dplyr::select_if(is.numeric) %>%
  as.data.frame() %>%
  as.matrix() %>%
  t()
colnames(monocle_matrix) = cleanedBattingData$player

# Per-player annotation: batting columns joined with the bowling columns.
# Players absent from the cleaned bowling table are labelled "Not bowler".
monocle_pData = new(
  "AnnotatedDataFrame",
  as.data.frame(
    dplyr::mutate(
      dplyr::left_join(monocle_batting, cleanedBowlingData, by = "player"),
      isBowler = coalesce(isBowler, "Not bowler")
    )
  )
)
rownames(monocle_pData) = cleanedBattingData$player

# Sanity check: matrix columns and annotation rows must line up.
identical(colnames(monocle_matrix), rownames(monocle_pData))
## [1] TRUE
# Feature annotation: one row per batting statistic, keyed by its name.
monocle_fData_pre = data.frame(
  gene_short_name = rownames(monocle_matrix),
  row.names = rownames(monocle_matrix)
)
monocle_fData = new("AnnotatedDataFrame", data = monocle_fData_pre)

# Sanity check: feature annotation rows must match the matrix rows.
identical(rownames(monocle_matrix), rownames(monocle_fData))
## [1] TRUE
# Assemble the monocle CellDataSet from the feature-by-player matrix and the
# two annotation tables built above, using a negative-binomial family.
# NOTE(review): this family is presumably meant for count data, while the
# matrix also holds non-integer columns such as battingAvg -- confirm that
# this choice is intended.
cricket <- newCellDataSet(
  monocle_matrix,
  phenoData = monocle_pData,
  featureData = monocle_fData,
  expressionFamily=negbinomial.size())

# Size factors are estimated first, then dispersions, on the same object.
cricket <- estimateSizeFactors(cricket)
cricket <- estimateDispersions(cricket)
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Removing 2 outliers
# Test each feature (batting statistic) against the isBowler label.
diff_test_res <- differentialGeneTest(cricket,
                                      fullModelFormulaStr = "~isBowler")
# Keep the features whose q-value is below 0.01 and register them as the
# ordering set on the CellDataSet.
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
cricket <- setOrderingFilter(cricket, ordering_genes)
plot_ordering_genes(cricket)
## Warning: Transformation introduced infinite values in continuous y-axis

# Reduce to two components with DDRTree.
cricket <- reduceDimension(cricket, max_components = 2,
    method = 'DDRTree')

# Order the players along the trajectory.
cricket <- orderCells(cricket)

# Trajectory plot coloured by the bowler label, with player names shown.
tmp = plot_cell_trajectory(cricket, 
                           color_by = "isBowler", 
                           show_cell_names = TRUE, cell_name_size = 3)
  # geom_text(aes(label = player), size = 1)

# Static version of the plot.
tmp

# Interactive version of the same plot.
plotly::ggplotly(tmp)

Numeric batting data

# Numeric batting features used for clustering / PCA / classification:
# all numeric columns plus the player name, with runs replaced by log10(runs)
# and the career start/end years removed.
numBattingData = cleanedBattingData %>%
  dplyr::select_if(is.numeric) %>%
  dplyr::bind_cols(dplyr::select(cleanedBattingData, player)) %>%
  dplyr::mutate(logRuns = log10(runsScored)) %>%
  dplyr::select(-runsScored, -careerStart, -careerEnd) %>%
  # log10(0) is -Inf; such rows cannot be used downstream.
  dplyr::filter(!is.infinite(logRuns)) %>%
  na.omit()

dim(numBattingData)
## [1] 620  11
# Plain numeric matrix of the batting features (player names removed).
numBattingMatrix = as.matrix(
  as.data.frame(
    dplyr::select(numBattingData, -player)
  )
)

K-means clustering

# kmeans() picks its initial centres at random, so without a fixed seed the
# cluster labels (and possibly the partition itself) change from run to run.
# Fix the seed so that the cluster sizes and the cross-tabulation further
# down are reproducible. 2018 matches the seed used in the commented-out
# ClassifyR::runTests() call.
set.seed(2018)

# Two-cluster k-means on the numeric batting features.
kmeansObj = kmeans(x = numBattingMatrix, centers = 2)
kmeansObj
## K-means clustering with 2 clusters of sizes 489, 131
## 
## Cluster means:
##   matchesPlayed inningsBatted   notOuts battingAvg hundredsScored
## 1      42.70143       68.9407  8.591002   28.20174       3.231084
## 2     102.69466      168.1756 18.954198   39.54229      15.152672
##   scoresOfFiftyOrMore ducksScored x10000RunsScored x50BattingAvg  logRuns
## 1            8.456033    7.327198       0.00000000    0.02453988 3.151360
## 2           28.862595   12.977099       0.09923664    0.19083969 3.721764
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 1 2 2 2 1 1 2 1 2
##  [36] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2
## [141] 2 2 2 1 2 2 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1
## [316] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 2 2 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 1 2 2 2 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1
## [456] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [561] 2 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [596] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 445437.8 411797.6
##  (between_SS / total_SS =  63.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

PCA

# Principal components of the (unscaled) numeric batting features.
pcaObj = stats::prcomp(numBattingMatrix)


library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
# Overlap between the players kept in the batting and bowling tables.
gplots::venn(list(
  battingPlayers = numBattingData$player,
  bowlingPlayers = cleanedBowlingData$player
))

# First two principal-component scores per player, with the k-means cluster
# and the bowling columns attached. Players missing from the cleaned bowling
# table are labelled "Not bowler".
pcaDataFrame = dplyr::mutate(
  dplyr::left_join(
    tibble(
      pca1 = pcaObj$x[, 1L],
      pca2 = pcaObj$x[, 2L],
      player = numBattingData$player,
      kmeans = as.factor(kmeansObj$cluster)
    ),
    cleanedBowlingData,
    by = "player"
  ),
  isBowler = coalesce(isBowler, "Not bowler")
)

# Cross-tabulate the bowler label against the k-means cluster.
table(
  pcaDataFrame$isBowler,
  pcaDataFrame$kmeans
)
##             
##                1   2
##   bowler     188  31
##   Not bowler 301 100
# PCA scatter plot: colour = bowler label, shape = k-means cluster.
# `label` is mapped so the player name is carried along with each point.
p1 = ggplot(
  pcaDataFrame,
  aes(
    x = pca1,
    y = pca2,
    colour = isBowler,
    shape = kmeans,
    label = player
  )
) +
  geom_point()


library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive version of the PCA scatter plot.
p1 %>% plotly::ggplotly()

Full logistic regression

# Modelling table: four batting predictors per player plus the bowler label.
# isBatsman is the 0/1 response (0 = labelled "bowler", 1 = otherwise).
logitData1 = cleanedBattingData %>%
  dplyr::select(
    player, notOuts, battingAvg, highestInningsScoreNum, ducksScored
  ) %>%
  dplyr::left_join(
    dplyr::select(cleanedBowlingData, player, isBowler),
    by = "player"
  ) %>%
  dplyr::mutate(
    isBowler = as.factor(coalesce(isBowler, "Not bowler")),
    isBatsman = ifelse(isBowler == "bowler", 0L, 1L),
    highestInningsScoreNum = as.numeric(highestInningsScoreNum)
  )

# Predictors and response only, complete cases.
logitData2 = na.omit(dplyr::select(logitData1, -player, -isBowler))

# Batting average split by the response.
ggplot(logitData2, aes(x = factor(isBatsman), y = battingAvg)) +
  geom_boxplot()

# Logistic regression of the batsman indicator on all remaining columns of
# logitData2 (notOuts, battingAvg, highestInningsScoreNum, ducksScored).
fullLogit = glm(isBatsman ~ ., family = "binomial", data = logitData2)

# Coefficient table, rendered as an interactive data table.
fullLogit %>%
  broom::tidy() %>%
  cleanDigits() %>%
  DT::datatable()

# Attach the fitted probability of being a batsman to every player.
# The model was fitted on logitData2 (after na.omit), so
# fullLogit$fitted.values only has one entry per *complete* row. Binding it
# directly onto logitData1 fails or misaligns as soon as a single row was
# dropped. predict() on logitData1 always returns exactly one value per row
# (NA where a predictor is missing) and equals the fitted values whenever
# no row was dropped.
logitObsData = cbind(
  logitData1,
  probBatsman = predict(fullLogit, newdata = logitData1, type = "response")
) %>%
  dplyr::mutate(isAllrounder = player %in% cleanedAllRounderData$player)

# Fitted probabilities for a handful of well-known players.
dplyr::filter(
  logitObsData,
  player %in% c("SK Warne", "GD McGrath", "DG Bradman", "GS Sobers", "MJ Clarke")
)
##       player notOuts battingAvg highestInningsScoreNum ducksScored
## 1  MJ Clarke      22      49.10                    329           9
## 2 DG Bradman      10      99.94                    334           7
## 3   SK Warne      17      17.32                     99          34
## 4 GD McGrath      51       7.36                     61          35
## 5  GS Sobers      21      57.78                    365          12
##     isBowler isBatsman probBatsman isAllrounder
## 1 Not bowler         1 0.901950262        FALSE
## 2 Not bowler         1 0.999827751        FALSE
## 3     bowler         0 0.147570883        FALSE
## 4     bowler         0 0.003311592        FALSE
## 5     bowler         0 0.962297895         TRUE

Supervised learning

# DMresults <- ClassifyR::runTests(numBattingMatrix, 
#                       classes = factor(pcaDataFrame$isBowler), 
#                       datasetName = "Batting",
#                       classificationName = "Different Means", 
#                       permutations = 20, folds = 5,
#                       seed = 2018, verbose = 1)
# DMresults




library(SmokyScotch)

# Cross-validated SVM on the numeric batting features, predicting the
# bowler label.
# NOTE(review): nFolds / nExp presumably mean 5 folds repeated 100 times on
# 5 cores -- confirm against the SmokyScotch documentation.
svmMultiResult = svmCV_multi(
  x = numBattingMatrix,
  y = factor(pcaDataFrame$isBowler),
  nFolds = 5,
  nExp = 100,
  cores = 5
)

# Same setup for the logistic classifier, with the label recoded to 0/1
# (factor level order, minus one).
logitMultiResult = logitCV_multi(
  x = data.frame(numBattingMatrix),
  y = as.integer(factor(pcaDataFrame$isBowler)) - 1L,
  nFolds = 5,
  nExp = 100,
  cores = 5
)

# rfMultiResult = rfCV_multi(
#   x = data.frame(numBattingMatrix), 
#   y = factor(pcaDataFrame$isBowler),
#   nFolds = 5, nExp = 100,
#   cores = 5)

# One mean error per element of each result list.
svmMultiError = purrr::map_dbl(svmMultiResult, "svmMeanError")
logitMultiError = purrr::map_dbl(logitMultiResult, "logitMeanError")
# rfMultiError = rfMultiResult %>% purrr::map_dbl("rfMeanError")

# Side-by-side distribution of the mean errors.
boxplot(
  data.frame(
    svmMultiError,
    logitMultiError
    # rfMultiError
  )
)

# Stack the per-result predictions (as character) into one matrix per
# classifier, one row per element of the result list.
predictMatrix1 = do.call(
  rbind,
  purrr::map(
    purrr::map(svmMultiResult, "svmPredictOrderedVector"),
    as.character
  )
)

predictMatrix2 = do.call(
  rbind,
  purrr::map(
    purrr::map(logitMultiResult, "logitPredictIntOrderedVector"),
    as.character
  )
)

# predictMatrix3 = purrr::map(rfMultiResult, "rfPredictOrderedVector") %>%
#   purrr::map(as.character) %>%
#   do.call(rbind,.)

# Per-player scores for each classifier, one row per classifier.
classifierMatrix = rbind(
  binaryClassScores(
    y = factor(pcaDataFrame$isBowler),
    predictMatrix = predictMatrix1
  ),
  binaryClassScores(
    y = as.integer(as.factor(pcaDataFrame$isBowler)) - 1L,
    predictMatrix = predictMatrix2
  )
  # binaryClassScores(y = factor(pcaDataFrame$isBowler),
  #                   predictMatrix = predictMatrix3)
)
rownames(classifierMatrix) = c("SVM", "Logit")

compareBinaryClassResults(
  y = factor(pcaDataFrame$isBowler),
  classifierMatrix
)

# PCA table with the SVM score and an all-rounder flag attached.
svmData = cbind(
  pcaDataFrame,
  svmScore = binaryClassScores(
    y = factor(pcaDataFrame$isBowler),
    predictMatrix = predictMatrix1
  )
) %>%
  mutate(isAllRounder = player %in% cleanedAllRounderData$player)



# Average SVM score for all-rounders versus everyone else.
svmData %>%
  dplyr::group_by(isAllRounder) %>%
  dplyr::summarise(meanSvmScore = mean(svmScore))
## # A tibble: 2 x 2
##   isAllRounder meanSvmScore
##   <lgl>               <dbl>
## 1 FALSE               0.870
## 2 TRUE                0.121
# Distribution of the SVM score by all-rounder flag.
ggplot(svmData, aes(x = isAllRounder, y = svmScore)) +
  geom_boxplot()

Plotting

# Highest innings score against batting average, with a straight-line fit
# (no confidence band). highestInningsScoreNum is stored as text, so it is
# converted on the fly.
ggplot(
  cleanedBattingData,
  aes(
    x = battingAvg,
    y = as.numeric(highestInningsScoreNum)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_classic(18) +
  labs(
    x = "Batting average",
    y = "Highest score"
  )